資料彙整流程

1. 交易項目計錄:Z

rm(list=ls(all=T))
Sys.setlocale("LC_TIME","C")
## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)
1.1 讀進資料
Z = read_csv("data/ta_feng_all_months_merged.csv") %>% data.frame %>% 
  setNames(c("date","cust","age","area","cat","prod","qty","cost","price"))
## Parsed with column specification:
## cols(
##   TRANSACTION_DT = col_character(),
##   CUSTOMER_ID = col_character(),
##   AGE_GROUP = col_character(),
##   PIN_CODE = col_character(),
##   PRODUCT_SUBCLASS = col_double(),
##   PRODUCT_ID = col_character(),
##   AMOUNT = col_double(),
##   ASSET = col_double(),
##   SALES_PRICE = col_double()
## )
nrow(Z)
## [1] 817741
資料格式轉換
Z$date = as.Date(Z$date, format="%m/%d/%Y")
Z$age[is.na(Z$age)] = "na"
Z$age = factor(Z$age, levels=c(
  "<25","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64",">65","na"), labels=c(
  "a20","a25","a30","a35","a40","a45","a50","a55","a60","a65","na"
  )) %>% as.character
Z$area = paste0("z",Z$area)
summary(Z)
##       date                cust               age           
##  Min.   :2000-11-01   Length:817741      Length:817741     
##  1st Qu.:2000-11-28   Class :character   Class :character  
##  Median :2001-01-01   Mode  :character   Mode  :character  
##  Mean   :2000-12-30                                        
##  3rd Qu.:2001-01-30                                        
##  Max.   :2001-02-28                                        
##      area                cat             prod                qty          
##  Length:817741      Min.   :100101   Length:817741      Min.   :   1.000  
##  Class :character   1st Qu.:110106   Class :character   1st Qu.:   1.000  
##  Mode  :character   Median :130106   Mode  :character   Median :   1.000  
##                     Mean   :284950                      Mean   :   1.382  
##                     3rd Qu.:520314                      3rd Qu.:   1.000  
##                     Max.   :780510                      Max.   :1200.000  
##       cost              price         
##  Min.   :     0.0   Min.   :     1.0  
##  1st Qu.:    35.0   1st Qu.:    42.0  
##  Median :    62.0   Median :    76.0  
##  Mean   :   112.1   Mean   :   131.9  
##  3rd Qu.:   112.0   3rd Qu.:   132.0  
##  Max.   :432000.0   Max.   :444000.0
處理離群值
# Quantile of Variables
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))
##        qty   cost   price
## 99%      6  858.0 1014.00
## 99.9%   14 2722.0 3135.82
## 99.95%  24 3799.3 3999.00
# Remove Outliers
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000) 
nrow(Z)  
## [1] 817182
彙總訂單 Assign Transaction ID
Z$tid = group_indices(Z, date, cust) # same customer same day
資料總覽
# No. cust, cat, prod, tid
sapply(Z[c("cust","cat","prod","tid")], n_distinct)
##   cust    cat   prod    tid 
##  32256   2007  23789 119422
# Summary of Item Records
summary(Z)
##       date                cust               age           
##  Min.   :2000-11-01   Length:817182      Length:817182     
##  1st Qu.:2000-11-28   Class :character   Class :character  
##  Median :2001-01-01   Mode  :character   Mode  :character  
##  Mean   :2000-12-30                                        
##  3rd Qu.:2001-01-30                                        
##  Max.   :2001-02-28                                        
##      area                cat             prod                qty        
##  Length:817182      Min.   :100101   Length:817182      Min.   : 1.000  
##  Class :character   1st Qu.:110106   Class :character   1st Qu.: 1.000  
##  Mode  :character   Median :130106   Mode  :character   Median : 1.000  
##                     Mean   :284784                      Mean   : 1.358  
##                     3rd Qu.:520311                      3rd Qu.: 1.000  
##                     Max.   :780510                      Max.   :24.000  
##       cost            price             tid        
##  Min.   :   0.0   Min.   :   1.0   Min.   :     1  
##  1st Qu.:  35.0   1st Qu.:  42.0   1st Qu.: 28783  
##  Median :  62.0   Median :  76.0   Median : 59391  
##  Mean   : 106.2   Mean   : 125.5   Mean   : 58845  
##  3rd Qu.: 112.0   3rd Qu.: 132.0   3rd Qu.: 87391  
##  Max.   :3798.0   Max.   :4000.0   Max.   :119422


2. 交易計錄:X

交易資料彙整
X = Z %>% group_by(tid) %>% summarise(
  date = date[1],             # 交易日期  
  cust = cust[1],             # 顧客 ID
  age = age[1],               # 顧客 年齡級別
  area = area[1],             # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame
nrow(X) # 119422                 
## [1] 119422
交易摘要
summary(X)    
##       tid              date                cust          
##  Min.   :     1   Min.   :2000-11-01   Length:119422     
##  1st Qu.: 29856   1st Qu.:2000-11-29   Class :character  
##  Median : 59712   Median :2001-01-01   Mode  :character  
##  Mean   : 59712   Mean   :2000-12-31                     
##  3rd Qu.: 89567   3rd Qu.:2001-02-02                     
##  Max.   :119422   Max.   :2001-02-28                     
##      age                area               items             pieces       
##  Length:119422      Length:119422      Min.   :  1.000   Min.   :  1.000  
##  Class :character   Class :character   1st Qu.:  2.000   1st Qu.:  3.000  
##  Mode  :character   Mode  :character   Median :  5.000   Median :  6.000  
##                                        Mean   :  6.843   Mean   :  9.294  
##                                        3rd Qu.:  9.000   3rd Qu.: 12.000  
##                                        Max.   :112.000   Max.   :339.000  
##      total           gross        
##  Min.   :    5   Min.   :-1645.0  
##  1st Qu.:  227   1st Qu.:   21.0  
##  Median :  510   Median :   68.0  
##  Mean   :  859   Mean   :  132.3  
##  3rd Qu.: 1082   3rd Qu.:  169.0  
##  Max.   :30171   Max.   : 8069.0
處理離群值
# Check Quantile & Remove Outliers
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##        items   pieces     total    gross
## 99.9%     54  81.0000  9009.579 1824.737
## 99.95%    62  94.2895 10611.579 2179.817
## 99.99%    82 133.0000 16044.401 3226.548
# Remove Outliers
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328
每周交易次數
par(cex=0.8)
hist(X$date, "weeks", freq=T, las=2, main="No. Transaction per Week")


3. 顧客資料:A

顧客資料彙整
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 33241
nrow(A)
## [1] 32241
顧客摘要
summary(A) 
##      cust                 r                s                f         
##  Length:32241       Min.   :  1.00   Min.   :  1.00   Min.   : 1.000  
##  Class :character   1st Qu.:  9.00   1st Qu.: 56.00   1st Qu.: 1.000  
##  Mode  :character   Median : 26.00   Median : 92.00   Median : 2.000  
##                     Mean   : 37.45   Mean   : 80.78   Mean   : 3.701  
##                     3rd Qu.: 60.00   3rd Qu.:110.00   3rd Qu.: 4.000  
##                     Max.   :120.00   Max.   :120.00   Max.   :85.000  
##        m                rev              raw              age           
##  Min.   :    8.0   Min.   :     8   Min.   : -784.0   Length:32241      
##  1st Qu.:  365.0   1st Qu.:   707   1st Qu.:   75.0   Class :character  
##  Median :  705.7   Median :  1750   Median :  241.0   Mode  :character  
##  Mean   :  993.1   Mean   :  3152   Mean   :  484.6                     
##  3rd Qu.: 1291.0   3rd Qu.:  3968   3rd Qu.:  612.0                     
##  Max.   :12636.0   Max.   :127686   Max.   :20273.0                     
##      area          
##  Length:32241      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
RFM分群
#做出RF分佈圖
# 切割頻率
orders.segm <- A %>%
  mutate(buy_freq=ifelse(between(f, 1, 10), '1',
                          ifelse(between(f, 11, 20), '2',
                                 ifelse(between(f, 21, 30), '3',
                                        ifelse(between(f, 31, 40), '4',
                                               ifelse(between(f, 41, 50), '5', '>5')))))) %>%
  
  
  # 切割近因畫出邊界
  mutate(segm.rec=ifelse(between(r, 0, 7), '0-7 days',
                         ifelse(between(r, 8, 15), '8-15 days',
                                ifelse(between(r, 16, 22), '16-22 days',
                                       ifelse(between(r, 23, 30), '23-30 days',
                                              ifelse(between(r, 31, 55), '31-55 days', '>55 days')))))) %>%
  arrange(A$cust)


# 定義邊界的順序
orders.segm$buy_freq <- factor(orders.segm$buy_freq, levels=c('>5', '5', '4', '3', '2', '1'))
orders.segm$segm.rec <- factor(orders.segm$segm.rec, levels=c('>55 days', '31-55 days', '23-30 days', '16-22 days', '8-15 days', '0-7 days'))



lcg <- orders.segm %>%
  group_by(segm.rec, buy_freq) %>%
  summarise(quantity=n()) %>%
  mutate(client='customers') %>%
  ungroup()

lcg.matrix= as.data.frame.matrix(table(orders.segm$buy_freq, orders.segm$segm.rec))
lcg.matrix$buy_freq = row.names(lcg.matrix) 
lcg.matrix
##    >55 days 31-55 days 23-30 days 16-22 days 8-15 days 0-7 days buy_freq
## >5        0          0          0          0         0       38       >5
## 5         0          0          0          0         0       46        5
## 4         0          0          0          0         9       68        4
## 3         0          2          2          5        40      254        3
## 2        15         31         34         69       372      935        2
## 1      8639       6223       2530       2817      4418     5694        1
# 繪製RFM分析圖
lcg.adv <- lcg %>%
  mutate(rec.type = ifelse(segm.rec %in% c(">55 days", "31-55 days", "23-30 days"), "not recent", "recent"),
         freq.type = ifelse(buy_freq %in% c(">5", "5", "4"), "frequent", "infrequent"),
         customer.type = interaction(rec.type, freq.type))

ggplot(lcg.adv, aes(x=client, y=quantity, fill=customer.type)) +
  theme_bw() +
  geom_rect(aes(fill = customer.type), xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf, alpha = 0.1) +
  facet_grid(buy_freq ~ segm.rec) +
  geom_bar(stat='identity', alpha=0.7) +
  geom_text(aes(y=max(quantity)/2, label=quantity), size=4) +
  ggtitle("R&F Analysis Graphics") +
  xlab("Days to last time purchase") + ylab("Purchase Frequency")+ 
  guides(fill=guide_legend(title="Group color"))+
  scale_fill_discrete(name="Experimental\nCondition",breaks = c('not recent.frequent','recent.frequent','not recent.infrequent','recent.infrequent'), labels = c('Former Customers','Frequent-coming Customers','One-time Customers','New Customers'))

lcg.sub <- orders.segm %>%
  group_by(segm.rec, buy_freq) %>%
  summarise(quantity=n()) %>%
  mutate(client='customers') %>%
  ungroup()
#細部切割10個客群
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
set.seed(111)
A$grp = kmeans(scale(A[,c(2:5)]),10)$cluster
table(A$grp)  # 族群大小
## 
##    1    2    3    4    5    6    7    8    9   10 
## 3247 1653 5239  337 2381 4706 1145 6470 2788 4275
#RFM分群泡泡圖
group_by(A, grp) %>% summarise(
  recent=mean(r), 
  freq=mean(f), 
  money=mean(m), 
  size=n() ) %>% 
  mutate( revenue = size*money/1000 )  %>% 
  filter(size > 1) %>% 
  ggplot(aes(x=freq, y=money)) +
  geom_point(aes(size=revenue, col=recent),alpha=0.5) +
  scale_size(range=c(4,30)) +
  scale_color_gradient(low="green",high="red") +
  scale_x_log10() + scale_y_log10(limits=c(30,5000)) + 
  geom_text(aes(label = size ),size=3) +
  theme_bw() + guides(size=F) +
  labs(title="Customer Segements",
       subtitle="(bubble_size:revenue_contribution; text:group_size)",
       color="Recency") +
  xlab("Frequency (log)") + ylab("Average Transaction Amount (log)")

ggplotly()
#分佈情形
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m')) 
  hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)

#定位分群
STS = c("菜雞","積優股","冤大頭","提款機","沈思者","大雄","Almost葛屁")
Status = function(rx,fx,mx,sx,K) {factor(
  ifelse(sx < 3*K,
         ifelse(fx*mx > 1600, "積優股", "菜雞"),
         ifelse(rx < 2*K,
                ifelse(sx/fx < 0.41*K,"提款機","冤大頭"),
                ifelse(rx < 4*K,"沈思者",
                       ifelse(rx < 5*K,"大雄","Almost葛屁")))), STS)}
K = as.integer(sum(A$s[A$f>1]) / sum(A$f[A$f>1])); K
## [1] 17
Check & Save
is.na(Z) %>% colSums
##  date  cust   age  area   cat  prod   qty  cost price   tid 
##     0     0     0     0     0     0     0     0     0     0
is.na(X) %>% colSums
##    tid   date   cust    age   area  items pieces  total  gross 
##      0      0      0      0      0      0      0      0      0
is.na(A) %>% colSums
## cust    r    s    f    m  rev  raw  age area  grp 
##    0    0    0    0    0    0    0    0    0    0
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")


Preparing The Predictors (X)

Sys.setlocale("LC_TIME","C")
## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)
load("data/tf0.rdata")
The Demarcation Date

Remove data after the demarcation date

feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
Aggregate for the Transaction Records
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # 交易日期
  cust = first(cust),  # 顧客 ID
  age = first(age),    # 顧客 年齡級別
  area = first(area),  # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame  # 88387
summary(X)
##       tid             date                cust          
##  Min.   :    1   Min.   :2000-11-01   Length:88387      
##  1st Qu.:22098   1st Qu.:2000-11-23   Class :character  
##  Median :44194   Median :2000-12-12   Mode  :character  
##  Mean   :44194   Mean   :2000-12-15                     
##  3rd Qu.:66290   3rd Qu.:2001-01-12                     
##  Max.   :88387   Max.   :2001-01-31                     
##      age                area               items             pieces       
##  Length:88387       Length:88387       Min.   :  1.000   Min.   :  1.000  
##  Class :character   Class :character   1st Qu.:  2.000   1st Qu.:  3.000  
##  Mode  :character   Mode  :character   Median :  5.000   Median :  6.000  
##                                        Mean   :  6.994   Mean   :  9.453  
##                                        3rd Qu.:  9.000   3rd Qu.: 12.000  
##                                        Max.   :112.000   Max.   :339.000  
##      total             gross        
##  Min.   :    5.0   Min.   :-1645.0  
##  1st Qu.:  230.0   1st Qu.:   23.0  
##  Median :  522.0   Median :   72.0  
##  Mean   :  888.7   Mean   :  138.3  
##  3rd Qu.: 1120.0   3rd Qu.:  174.0  
##  Max.   :30171.0   Max.   : 8069.0
Check Quantile and Remove Outlier
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
Aggregate for Customer Records
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
    Status = Status(r,f,m,s,K)
  ) %>% data.frame      # 28584
nrow(A)
## [1] 28584



Preparing the Target Variables (Y)

Aggregate Feb’s Transaction by Customer
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16899
The Target for Regression - A$amount

Simply a Left Joint

A = merge(A, feb, by="cust", all.x=T)
The Target for Classification - A$buy
A$buy = !is.na(A$amount)
Summary of the Dataset
summary(A)
##      cust                 r               s               f         
##  Length:28584       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area                  Status          amount         buy         
##  Length:28584       菜雞      : 5698   Min.   :    8   Mode :logical  
##  Class :character   積優股    : 2489   1st Qu.:  454   FALSE:15342    
##  Mode  :character   冤大頭    :10048   Median :  993   TRUE :13242    
##                     提款機    :  822   Mean   : 1499                  
##                     沈思者    : 5640   3rd Qu.: 1955                  
##                     大雄      : 3005   Max.   :28089                  
##                     Almost葛屁:  882   NA's   :15342
The Association of Categorial Predictors
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')

tapply(A$buy, A$area, mean) %>% barplot(las=2)
abline(h = mean(A$buy), col='red')

X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008  8576
cbind(A, spl) %>% filter(buy) %>% 
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)

A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242  9269  3973
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)

save(Z, X, A, spl, spl2, file="data/tf2.rdata")



Loading & Preparing Data

TR = subset(A, spl)
TS = subset(A, !spl)


Classification Model

glm1 = glm(buy ~ ., TR[,c(2:8,10,12)], family=binomial()) 
summary(glm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:8, 
##     10, 12)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5766  -0.8455  -0.7286   1.0460   1.9211  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.160e+00  9.124e-02 -12.712  < 2e-16 ***
## r                -1.289e-02  1.819e-03  -7.087 1.38e-12 ***
## s                 7.528e-03  1.578e-03   4.770 1.84e-06 ***
## f                 3.257e-01  1.735e-02  18.770  < 2e-16 ***
## m                -5.491e-05  3.011e-05  -1.824  0.06820 .  
## rev               3.900e-05  1.942e-05   2.009  0.04455 *  
## raw              -2.251e-04  8.488e-05  -2.652  0.00801 ** 
## agea25           -4.958e-02  8.684e-02  -0.571  0.56802    
## agea30           -8.203e-03  7.996e-02  -0.103  0.91829    
## agea35            4.574e-02  7.921e-02   0.577  0.56369    
## agea40            5.508e-02  8.131e-02   0.677  0.49816    
## agea45           -9.187e-04  8.463e-02  -0.011  0.99134    
## agea50            5.231e-03  9.325e-02   0.056  0.95527    
## agea55            1.597e-01  1.093e-01   1.460  0.14419    
## agea60            5.839e-02  1.176e-01   0.497  0.61937    
## agea65            2.504e-01  1.046e-01   2.394  0.01669 *  
## agena            -2.537e-01  1.459e-01  -1.738  0.08213 .  
## Status積優股      1.005e-01  7.061e-02   1.424  0.15454    
## Status冤大頭      1.081e-01  8.715e-02   1.240  0.21489    
## Status提款機     -1.290e+00  2.725e-01  -4.735 2.19e-06 ***
## Status沈思者      2.423e-01  8.257e-02   2.934  0.00334 ** 
## Status大雄        1.554e-01  1.126e-01   1.380  0.16767    
## StatusAlmost葛屁  6.434e-02  1.559e-01   0.413  0.67979    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27629  on 20007  degrees of freedom
## Residual deviance: 23359  on 19985  degrees of freedom
## AIC: 23405
## 
## Number of Fisher Scoring iterations: 6
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3755  848
##   TRUE   1684 2289
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts          # 0.69998
## [1] 0.7047575
colAUC(pred, TS$buy)                                   # 0.7556
##                     [,1]
## FALSE vs. TRUE 0.7518547


Regression Model

A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8,9,11)])
summary(lm1)
## 
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8, 9, 11)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.83304 -0.22812  0.04853  0.28096  1.64243 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.1403704  0.0504979  22.583  < 2e-16 ***
## r             0.0000702  0.0003090   0.227  0.82030    
## s             0.0001173  0.0003123   0.376  0.70721    
## f             0.0256836  0.0017965  14.296  < 2e-16 ***
## m             0.5045943  0.0372711  13.539  < 2e-16 ***
## rev           0.0450307  0.0360945   1.248  0.21222    
## agea25        0.0737926  0.0251165   2.938  0.00331 ** 
## agea30        0.1204660  0.0230651   5.223 1.80e-07 ***
## agea35        0.1264592  0.0227496   5.559 2.79e-08 ***
## agea40        0.1382214  0.0232522   5.944 2.87e-09 ***
## agea45        0.1085828  0.0242698   4.474 7.77e-06 ***
## agea50        0.0787808  0.0264917   2.974  0.00295 ** 
## agea55        0.0703242  0.0312462   2.251  0.02443 *  
## agea60        0.0694822  0.0321119   2.164  0.03051 *  
## agea65       -0.0284007  0.0282282  -1.006  0.31439    
## agena         0.1124434  0.0395590   2.842  0.00449 ** 
## areaz106      0.0789586  0.0435321   1.814  0.06974 .  
## areaz110      0.0375241  0.0353641   1.061  0.28868    
## areaz114     -0.0111101  0.0371762  -0.299  0.76506    
## areaz115      0.0111809  0.0325803   0.343  0.73147    
## areaz221      0.0147066  0.0328140   0.448  0.65403    
## areazOthers   0.0249228  0.0349567   0.713  0.47589    
## areazUnknown  0.0105550  0.0388962   0.271  0.78612    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4216 on 9246 degrees of freedom
## Multiple R-squared:  0.291,  Adjusted R-squared:  0.2893 
## F-statistic: 172.5 on 22 and 9246 DF,  p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
## [1] 0.2909908 0.2575966


Prediction

load("data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
    Status = Status(r,f,m,s,K)
  ) %>% data.frame      # 28584
nrow(B)
## [1] 28531
B$Buy = predict(glm1, B, type="response")
B$Rev = predict(lm1, B)
par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))






par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))






篩出目標顧客
#篩出目標顧客以做行銷
Target = subset(B, Status=="績優股"|Status == "菜雞")
Target%>%group_by(age)%>%summarise(
  Rev = sum(Rev)
)%>%arrange(desc(Rev))
## # A tibble: 11 x 2
##    age       Rev
##    <chr>   <dbl>
##  1 a35   476791.
##  2 a30   453647.
##  3 a40   367667.
##  4 a45   265742.
##  5 a25   235131.
##  6 a50   163905.
##  7 a20   116004.
##  8 a65    81776.
##  9 a55    78311.
## 10 a60    50466.
## 11 na     37602.
P0=A$ProbRetain
R0=A$PredRevenue 
m=0.20; a=20; b=15
curve(m*plogis((10/a)*(x-b)), 0, 30, lwd=2, ylim=c(0, 0.25),
      main=TeX('$m \\cdot Logis(10(x - b)/a)$'), ylab="f(x)")
abline(h=seq(0,0.2,0.05),v=seq(0,30,5),col='lightgrey',lty=2)

library(manipulate)
#使用manipulate套件進行行銷模擬(需複製到R Script)
# manipulate({
#   curve(m*plogis((10/a)*(x-b)), 0, 30, lwd=2, ylim=c(0, 0.25),
#         main = TeX('$m \\cdot Logis(10(x - b)/a)$'), ylab="f(x)")
#   abline(h=seq(0,0.2,0.05),v=seq(0,30,5),col='lightgrey',lty=2)
# },
# m = slider(0.05, 0.25,  0.20, step=0.01),
# a = slider(  10,   30,    20, step=1),
# b = slider(   4,   20,    15, step=1)
# ) 

#調整變數
#行銷情境1
m=0.2; a=20; b=20
m=0.2; a
## [1] 20
do.call(rbind, lapply(seq(5,40,0.5), function(c){
  p = m*plogis((10/a)*(c-b))
  Target %>% mutate(
    PI = ifelse(Buy<=(1-p), p, 1-Buy) * Rev - c
  ) %>%
    group_by(Status) %>% summarise(
      Cost = c,
      Group.Sz = n(),
      No.Target = sum(PI>0),
      AvgROI = mean(PI[PI>0]),
      TotalROI = sum(PI[PI>0])
    ) } ) ) %>% 
  ggplot(aes(x=Cost, y=TotalROI, col=Status)) +
  geom_line(size=1.2) +
  ggtitle("Cost Effeciency per Segment ")

#行銷情境二
m=0.2; a=20; b=11
do.call(rbind, lapply(seq(5,40,0.5), function(c){
  p = m*plogis((10/a)*(c-b))
  Target %>% mutate(
    PI = ifelse(Buy<=(1-p), p, 1-Buy) * Rev - c
  ) %>%
    group_by(Status) %>% summarise(
      Cost = c,
      Group.Sz = n(),
      No.Target = sum(PI>0),
      AvgROI = mean(PI[PI>0]),
      TotalROI = sum(PI[PI>0])
    ) } ) ) %>% 
  ggplot(aes(x=Cost, y=TotalROI, col=Status)) +
  geom_line(size=1.2) +
  ggtitle("Cost Effeciency per Segment ")